Overview:
The aim of this work is to study the affect of other variables on diamond`s price.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(patchwork)
library(skimr)
diamonds <- read_csv("https://raw.githubusercontent.com/tidyverse/ggplot2/master/data-raw/diamonds.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## carat = col_double(),
## cut = col_character(),
## color = col_character(),
## clarity = col_character(),
## depth = col_double(),
## table = col_double(),
## price = col_double(),
## x = col_double(),
## y = col_double(),
## z = col_double()
## )
cut_levels = c("Fair", "Good", "Very Good", "Premium", "Ideal")
color_levels = c("D", "E", "F", "G", "H", "I", "J")
clarity_levels = c("I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF")
diamonds <- diamonds %>%
mutate(
cut = factor(cut, levels = cut_levels, ordered = TRUE),
color = factor(color, levels = color_levels, ordered = TRUE),
clarity = factor(clarity, levels = clarity_levels, ordered = TRUE))
library(skimr)
diamonds %>%
skim()
| Name | Piped data |
| Number of rows | 53940 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| cut | 0 | 1 | TRUE | 5 | Ide: 21551, Pre: 13791, Ver: 12082, Goo: 4906 |
| color | 0 | 1 | TRUE | 7 | G: 11292, E: 9797, F: 9542, H: 8304 |
| clarity | 0 | 1 | TRUE | 8 | SI1: 13065, VS2: 12258, SI2: 9194, VS1: 8171 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| carat | 0 | 1 | 0.80 | 0.47 | 0.2 | 0.40 | 0.70 | 1.04 | 5.01 | ▇▂▁▁▁ |
| depth | 0 | 1 | 61.75 | 1.43 | 43.0 | 61.00 | 61.80 | 62.50 | 79.00 | ▁▁▇▁▁ |
| table | 0 | 1 | 57.46 | 2.23 | 43.0 | 56.00 | 57.00 | 59.00 | 95.00 | ▁▇▁▁▁ |
| price | 0 | 1 | 3932.80 | 3989.44 | 326.0 | 950.00 | 2401.00 | 5324.25 | 18823.00 | ▇▂▁▁▁ |
| x | 0 | 1 | 5.73 | 1.12 | 0.0 | 4.71 | 5.70 | 6.54 | 10.74 | ▁▁▇▃▁ |
| y | 0 | 1 | 5.73 | 1.14 | 0.0 | 4.72 | 5.71 | 6.54 | 58.90 | ▇▁▁▁▁ |
| z | 0 | 1 | 3.54 | 0.71 | 0.0 | 2.91 | 3.53 | 4.04 | 31.80 | ▇▁▁▁▁ |
diamonds %>%
ggplot(aes(x = price)) +
geom_histogram() +
scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
diamonds %>%
select(where(is.numeric)) %>%
mutate(id = row_number()) %>%
pivot_longer( cols = -id) %>%
ggplot(aes(x = value)) +
geom_histogram() +
facet_wrap(~name) +
facet_wrap(~name, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
diamonds %>%
select(where(is.numeric)) %>%
mutate(id = row_number()) %>%
pivot_longer( cols = -id)%>%
ggplot(aes(x = value)) +
geom_boxplot() +
facet_wrap(~name) +
facet_wrap(~name,
scales = "free")
diamonds %>%
select(where(is.factor)) %>%
mutate(id = row_number()) %>%
pivot_longer(cols = -id, values_ptypes = list(value = 'character')) %>%
mutate(value = factor(value, c(cut_levels, color_levels, clarity_levels), ordered = TRUE)) %>%
ggplot(aes(y = value)) +
geom_bar() +
facet_wrap(~name) +
facet_wrap(~name, scales = "free")
| feature | issue | potential soluation | comment |
|---|---|---|---|
| price | Right skewed | take log | result is bimodal |
| carat | Right skewed | take log | |
| depth | Outlier (right-left) | is it valid? | |
| table | Outlier (right-left) | is it valid? | |
| x | Outlier (right-left) | is it valid? | |
| y | Outlier (right-left) | is it valid? | |
| z | Outlier (right-left) | is it valid? | |
| clarity | very view of I1,IF |
||
| color | |||
| cut | very view of Fair |
price_carat<-diamonds %>%
ggplot(aes(x = carat,
y = price )) +
geom_point() +
ggtitle("before log10")
price_carat_log<-diamonds %>%
ggplot(aes(x = carat,
y = price )) +
geom_point() +
scale_y_log10() +
scale_x_log10() +
ggtitle("after log10")
price_carat + price_carat_log
there is a strong relationship between Carat and price correlation is0.9215913 and after we take the log of both correlation is 0.9659137.
# install.packages("patchwork")
# install.packages("ggridges")
library(ggridges)
library(patchwork)
price_cut_box<-diamonds %>%
ggplot(aes(x = cut,
y = price)) +
geom_boxplot() +
scale_y_log10()
price_cut_dens<- diamonds %>%
ggplot(aes(y = cut,
x = price)) +
geom_density_ridges() +
scale_x_log10()
price_cut_box + coord_flip() + price_cut_dens
## Picking joint bandwidth of 0.0613
why as the cut quality increases median price decreases?
p1<-diamonds %>%
ggplot(aes(y = price, x = carat)) +
geom_jitter() +
scale_x_log10() +
scale_y_log10()
p2<-diamonds %>%
ggplot(aes(y = price, x = cut)) +
geom_boxplot() +
scale_y_log10()
p3<-diamonds %>%
ggplot(aes(y = carat, x = cut)) +
geom_boxplot() +
scale_y_log10()
p4<- diamonds %>%
ggplot(aes(x = log10(carat), y = log10(price), color = cut)) +
geom_jitter(alpha = .2) +
geom_smooth()
(p1 + p2 + p3)/p4
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
diamonds %>% #BREAK
mutate(carat = cut_width(carat, 1)) %>% #BREAK
ggplot(aes(x = price,
fill = carat)) + #BREAK
geom_histogram() +#BREAK
scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.